#install.packages("mlbench")
#install.packages("C50")
#install.packages("magrittr")
#install.packages("ROSE")
#install.packages("rpart")
library(caret)
library(mlbench)
library(C50)
library(dplyr)
library(plotly)
library(caret)
library(ROSE)
library(rpart)
library(GGally)

Setting up workspace

setwd("~/git/data-analysis/lab03/")

Loading DATA

Our data frame will be the train.csv file, in which we’ll peform predictions models and test.csv will be used to Caggle challenge.

data <- read.csv("data/all/train.csv")
test.kaggle <- read.csv("data/all/test.csv")

Here we gonna see the correlation between the variables, then will se the ones which has a strong correlation and remove, because keep both would be redundant for our prediction model.

data.correlation1 <- data %>% select(-c(sequencial_candidato, nome, estado_civil, ano, cargo))

data.correlation <- data.correlation1  %>%
  mutate(situacao = as.factor(situacao)) %>%
  mutate(uf = as.factor(uf)) %>%
  mutate(partido = as.factor(partido)) %>%
  mutate(sexo = as.factor(sexo)) %>%
  mutate(grau = as.factor(grau)) %>%
  mutate(ocupacao = as.factor(ocupacao))

data.correlation %>% 
  select(-partido,
         -uf,-grau,-sexo) %>%
  na.omit() %>%
  ggcorr(palette = "RdBu",
         color = "grey50",
         label = TRUE, hjust = 1,
         label_size = 3, size = 4,
         nbreaks = 5, layout.exp = 7) +
  ggtitle("Gráfico de correlação eleições 2006")

We choosed to remove those three categoric variables in order to run the model, otherwise it would take too much time. But for a better result you could let them on the data. And also remove those variable which have strong correlation

data <- data %>%
  select(-cargo, -nome, -ocupacao, -sexo, -total_despesa, -total_receita, -sequencial_candidato )
test.kaggle <- test.kaggle %>%
  select(-cargo, -nome, -ocupacao, total_despesa, -total_receita)

In the data would be better replace the NA for the column media, but we choosed replace by zero.

data[is.na(data)] <- 0
test.kaggle[is.na(test.kaggle)] <- 0

As our target is to predict the variable situacao we need to see if our data is balanced, so what is the class distribution?

data_class_destribution <- data %>% group_by(situacao) %>% summarize(class_count = n())
p <- plot_ly(data_class_destribution, x = ~situacao, y = ~class_count, type = 'bar',
        marker = list(color = c('rgba(204,204,204,1)', 'rgba(222,45,38,0.8)'))) %>%
  layout(title = "Class Balance",
         xaxis = list(title = "Situation"),
         yaxis = list(title = "Count"))
p

cleary unbalanced!

So what should we do? We gonna balance it.

There is some ways to balance data which are:

  1. Undersampling That method reduces the number of observation from the majoritary class in order to balance the data set.
  1. Oversampling This method increase the number of observation from the minoritary class and make it balanced.
  1. Both Sampling Here it uses the technique 1 and 2 to make the data set balanced
  1. ROSE Sampling Data synthetic generation and it provades a better stimation of original data.

Before balance it we gonna do a experiment. Let’s create a model and see how is goes whitout balance in order to predict and see accuracy to compare in the future

For tu build our models we gonna need data to train and test so we’ll divid the original data into train and test, 70% to raing and 30% to test.

set.seed(42)
index <- createDataPartition(data$situacao, p = 0.7, list = FALSE)
unbalanced.train <- data[index, ]
unbalanced.test <- data[-index, ]

Decision Tree whit unbalanced data

treeimb <- rpart(situacao ~ ., data = unbalanced.train)
pred.treeimb <- predict(treeimb, newdata = unbalanced.test)

accuracy.meas(unbalanced.test$situacao, pred.treeimb[,2])
## 
## Call: 
## accuracy.meas(response = unbalanced.test$situacao, predicted = pred.treeimb[, 
##     2])
## 
## Examples are labelled as positive when predicted is greater than 0.5 
## 
## precision: 0.955
## recall: 0.949
## F: 0.476
roc.curve(unbalanced.test$situacao, pred.treeimb[,2], plotit = F)
## Area under the curve (AUC): 0.890

Surprisely we’ve got a good precision and recall. Anyways let’s see how it goes whit balabced data.

Lets balance it, all data by using the 4 method ROSE Sampling which it gonna generate syntetich data.

data.rose <- ROSE(situacao ~ ., data = data, seed = 1)$data
table(data.rose$situacao)
## 
## nao_eleito     eleito 
##       3842       3780

YEAH!

It looks pretty balanced now. That is great, so we now gonna peform some models and avaliate its metrics.

Yes, we need to particionate our balanced data now, using the same schema before.

set.seed(42)
index <- createDataPartition(data.rose$situacao, p = 0.7, list = FALSE)
train <- data.rose[index, ]
test <- data.rose[-index, ]

knn

First model is Knn.

k-nearest neighbour classification for test set from training set. For each row of the test set, the k nearest (in Euclidean distance) training set vectors are found, and the classification is decided by majority vote, with ties broken at random. If there are ties for the kth nearest vector, all candidates are included in the vote.

fitControl <- trainControl(method = "repeatedcv", 
                           number = 10,
                           repeats = 10)

preProcess = c("center", "scale","nzv" )
model.knn <- train(situacao ~ ., 
               data = train,
               trControl = fitControl,
               method = "knn", # pode ser 'lasso'ldf
               metric = "Accuracy",
               preProcess = preProcess)

model.knn
## k-Nearest Neighbors 
## 
## 5336 samples
##   16 predictors
##    2 classes: 'nao_eleito', 'eleito' 
## 
## Pre-processing: centered (29), scaled (29), remove (49) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 4802, 4803, 4803, 4802, 4802, 4802, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7952580  0.5893739
##   7  0.7834694  0.5656935
##   9  0.7798721  0.5584564
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
knn_prediction <- predict(model.knn,test)

knn_data <- data.frame(pred = knn_prediction, obs = test$situacao)

knn_cv <- round(defaultSummary(knn_data),digits = 4)

knn_cv
## Accuracy    Kappa 
##   0.7940   0.5868

Logistic Regression

Second model to be build. That model aims to fit a regression curve, y= f(x), when y is a categorical variable.

model.logistic_reg <- train(situacao ~ ., 
               data = train,
               trControl = fitControl,
               method = 'LogitBoost', 
               metric = "Accuracy",
               preProcess = preProcess)

model.logistic_reg
## Boosted Logistic Regression 
## 
## 5336 samples
##   16 predictors
##    2 classes: 'nao_eleito', 'eleito' 
## 
## Pre-processing: centered (29), scaled (29), remove (49) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 4803, 4802, 4802, 4802, 4803, 4803, ... 
## Resampling results across tuning parameters:
## 
##   nIter  Accuracy   Kappa    
##   11     0.9309596  0.8619063
##   21     0.9632113  0.9264178
##   31     0.9675785  0.9351519
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was nIter = 31.
logistic_reg_prediction <- predict(model.logistic_reg,test)

logistic_reg_data <- data.frame(pred = logistic_reg_prediction, obs = test$situacao)

logistic_reg_cv <- round(defaultSummary(logistic_reg_data),digits = 4)

logistic_reg_cv
## Accuracy    Kappa 
##   0.9659   0.9318

Decision Tree

Third model Decision tree is a graph to represent choices and their results in form of a tree. The nodes in the graph represent an event or choice and the edges of the graph represent the decision rules or conditions.

new_index <- createDataPartition(data.rose$situacao, p = 0.7, list = FALSE)
new_train_data <- data.rose[index, ]
new_test_data  <- data.rose[-index, ]


new_treeimb <- rpart(situacao ~ ., data = new_train_data)
new_pred.treeimb <- predict(new_treeimb, newdata = new_test_data)


accuracy.meas(new_test_data$situacao, new_pred.treeimb[,2])
## 
## Call: 
## accuracy.meas(response = new_test_data$situacao, predicted = new_pred.treeimb[, 
##     2])
## 
## Examples are labelled as positive when predicted is greater than 0.5 
## 
## precision: 0.891
## recall: 0.970
## F: 0.465
model.tree_dec <- train(situacao ~ .,
                data= train, 
                method = "rpart",
                trControl = fitControl,
                cp=0.001,  
                metric = "Accuracy",
                maxdepth=20)
model.tree_dec
## CART 
## 
## 5336 samples
##   16 predictors
##    2 classes: 'nao_eleito', 'eleito' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 4802, 4802, 4803, 4803, 4802, 4802, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05442177  0.8901236  0.7802011
##   0.15873016  0.8350820  0.6695660
##   0.59372638  0.6366796  0.2684000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05442177.
tree_prediction <- predict(model.tree_dec,test)

tree_data <- data.frame(pred = tree_prediction, obs = test$situacao)

tree_cv <- round(defaultSummary(tree_data),digits = 4)

tree_cv
## Accuracy    Kappa 
##   0.8644   0.7286

AdaBoost

Boosting is an ensemble technique that attempts to create a strong classifier from a number of weak classifiers.

model.adaboost <- train(situacao ~ ., 
               data = data.rose,
               trControl = trainControl(method = "repeatedcv", 
                           number = 10,
                           repeats = 5),
               method = 'adaboost', 
               metric = "Accuracy",
               preProcess = preProcess)

model.adaboost
## AdaBoost Classification Trees 
## 
## 7622 samples
##   16 predictors
##    2 classes: 'nao_eleito', 'eleito' 
## 
## Pre-processing: centered (29), scaled (29), remove (49) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 6860, 6860, 6859, 6860, 6860, 6860, ... 
## Resampling results across tuning parameters:
## 
##   nIter  method         Accuracy   Kappa    
##    50    Adaboost.M1    0.9900286  0.9800577
##    50    Real adaboost  0.9873258  0.9746551
##   100    Adaboost.M1    0.9908683  0.9817371
##   100    Real adaboost  0.9880083  0.9760199
##   150    Adaboost.M1    0.9912096  0.9824195
##   150    Real adaboost  0.9881919  0.9763869
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 150 and method
##  = Adaboost.M1.
adaboost_prediction <- predict(model.adaboost,test)

adaboost_data <- data.frame(pred = adaboost_prediction, obs = test$situacao)

adaboost_cv <- round(defaultSummary(adaboost_data),digits = 4)

adaboost_cv
## Accuracy    Kappa 
##        1        1

Which atribuite are most important to each model

As far we can see for ano and sexo we’ve a low outcome for importance, so those variables should be removed. As well the best variables pointe for all model are recursos_de_pessoas_juridicas, recursos_de_pessoas_fisicas and the other differ in the order.

KNN

varImp(model.knn)
## ROC curve variable importance
## 
##                                       Importance
## recursos_de_pessoas_juridicas             100.00
## recursos_de_pessoas_fisicas                91.04
## media_receita                              81.48
## quantidade_fornecedores                    74.19
## quantidade_despesas                        73.98
## quantidade_doadores                        47.20
## quantidade_doacoes                         47.05
## media_despesa                              46.76
## recursos_de_partido_politico               44.60
## recursos_de_outros_candidatos.comites      40.22
## recursos_proprios                          36.43
## grau                                       28.73
## uf                                         26.86
## partido                                    23.78
## estado_civil                               22.32
## ano                                         0.00

Logistic Regression

varImp(model.logistic_reg)
## ROC curve variable importance
## 
##                                       Importance
## recursos_de_pessoas_juridicas             100.00
## recursos_de_pessoas_fisicas                91.04
## media_receita                              81.48
## quantidade_fornecedores                    74.19
## quantidade_despesas                        73.98
## quantidade_doadores                        47.20
## quantidade_doacoes                         47.05
## media_despesa                              46.76
## recursos_de_partido_politico               44.60
## recursos_de_outros_candidatos.comites      40.22
## recursos_proprios                          36.43
## grau                                       28.73
## uf                                         26.86
## partido                                    23.78
## estado_civil                               22.32
## ano                                         0.00

Decision Tree

varImp(model.tree_dec)
## rpart variable importance
## 
##   only 20 most important variables shown (out of 78)
## 
##                                       Overall
## recursos_de_pessoas_fisicas            100.00
## quantidade_doacoes                      89.12
## recursos_de_pessoas_juridicas           78.10
## quantidade_fornecedores                 61.01
## quantidade_despesas                     58.30
## quantidade_doadores                     32.17
## recursos_de_partido_politico            31.33
## recursos_de_outros_candidatos.comites   30.86
## partidoPCO                               0.00
## media_despesa                            0.00
## ufAL                                     0.00
## `grauLÊ E ESCREVE`                       0.00
## partidoPSB                               0.00
## ufSE                                     0.00
## `grauSUPERIOR COMPLETO`                  0.00
## partidoPSOL                              0.00
## ufAM                                     0.00
## ufSP                                     0.00
## partidoPTC                               0.00
## ufPB                                     0.00

AdaBoost

varImp(model.adaboost)
## ROC curve variable importance
## 
##                                       Importance
## recursos_de_pessoas_juridicas             100.00
## recursos_de_pessoas_fisicas                89.00
## media_receita                              81.08
## quantidade_fornecedores                    75.39
## quantidade_despesas                        74.89
## quantidade_doadores                        47.81
## quantidade_doacoes                         47.38
## media_despesa                              47.32
## recursos_de_partido_politico               44.40
## recursos_proprios                          38.41
## recursos_de_outros_candidatos.comites      37.65
## grau                                       30.53
## uf                                         26.91
## partido                                    24.10
## estado_civil                               22.89
## ano                                         0.00

Kaggle challenge

As far we can see for ano and sexo we’ve a low outcome for importance, so those variables should be removed.

As propose in the activite we are going to use our best model to submite the votos prediction to the challenge in Kaggle.

model.adaboost $xlevels[["ocupacao"]] <- union(model.adaboost$xlevels[["ocupacao"]], levels(test.kaggle$ocupacao))
prediction_ <- predict(model.adaboost , test.kaggle)
ID <- test.kaggle %>%
  select(sequencial_candidato)
colnames(ID)[colnames(ID)=="sequencial_candidato"] <- "ID"
predicted_file <- ID
predicted_file$situacao <- prediction_
predicted_file$situacao[predicted_file$situacao < 0] <- 0
write.csv(predicted_file, "sample_submission.csv", row.names=FALSE)

usefull links: http://www.treselle.com/blog/handle-class-imbalance-data-with-r/ https://www.analyticsvidhya.com/blog/2016/03/practical-guide-deal-imbalanced-classification-problems/ https://shiring.github.io/machine_learning/2017/04/02/unbalanced